#!/usr/bin/env python2
import os
import sys
import socket
import time
import getopt
import subprocess
import fcntl
import types
import errno
import random
import threading
import commands
import re
from time import sleep, ctime

# Resolve the sibling "../admin" directory relative to this script's real
# (symlink-resolved) location and put it first on sys.path, so the project
# imports that follow are looked up there before anywhere else.
admin = os.path.abspath(os.path.split(os.path.realpath(__file__))[0] + '/../admin')
sys.path.insert(0, admin)

from utils import  _get_value, Exp, _dwarn, _dmsg, _derror, _exec_system, _exec_pipe
from global_variable import dock_list, site_count, rack_count, host_count
#from test_list import dock_cmd,

# Command prefix used by node_start() to launch lichd under valgrind memcheck.
# The leading/trailing spaces are significant: the value is spliced into a
# larger shell command line.
VALGRIND_CMD = " valgrind --tool=memcheck --leak-check=full --show-reachable=yes --time-stamp=yes -v "

# grep (basic regex) alternation of substrings that node_check_valgrind()
# searches for in the lich log.  Written as a raw string so that the "\|"
# separators reach grep verbatim without depending on Python leaving an
# unknown escape sequence untouched.
VALGRIND_KEYWORD = r"llegal\|inappropriate\|inadequate\|verlapping\|emory leak\|overlap\|Invalid read\|Invalid write\|definitely"

# Upper bound on the number of distinct containers started under valgrind.
VALGRIND_MAX = 4
# Containers that have been started under valgrind so far (filled by
# node_start()).
VALGRIND_LIST = []

def node_online(docker):
    """Placeholder hook; currently does nothing and returns None.

    :param docker: container name (unused).
    """
    return None

def node_stop(docker, valgrind):
    """Run ``lich.node --stop`` inside the given container.

    :param docker: name of the container passed to ``docker exec``.
    :param valgrind: accepted for symmetry with node_start(); not used here.
    """
    stop_cmd = "docker exec %s lich.node --stop" % (docker)
    _exec_system(stop_cmd)

def node_rmshm(docker):
    """Remove ``/dev/shm/lich4/`` inside the given container.

    :param docker: name of the container passed to ``docker exec``.
    """
    cleanup_cmd = "docker exec %s rm -rf /dev/shm/lich4/" % (docker)
    _exec_system(cleanup_cmd)

def node_start(docker, valgrind):
    """Start the lich node in a container, optionally under valgrind.

    When ``valgrind`` is truthy the node is launched through VALGRIND_CMD,
    but only for at most VALGRIND_MAX distinct containers: a container is
    added to VALGRIND_LIST while there is room, and once the list is full
    only containers already in it keep using valgrind.  In every other case
    the plain ``lich.node --start`` command is used.

    :param docker: name of the container passed to ``docker exec``.
    :param valgrind: whether valgrind mode was requested.
    """
    plain_cmd = "docker exec %s lich.node --start" % (docker)
    valgrind_cmd = 'docker exec -dit %s bash -c "%s  /opt/fusionstack/lich/sbin/lichd --home /opt/fusionstack/data -f  >> /opt/fusionstack/log/lich.log 2>&1 "' % (docker, VALGRIND_CMD)

    use_valgrind = False
    if valgrind:
        tracked = len(VALGRIND_LIST)
        if tracked < VALGRIND_MAX:
            # Still room: run under valgrind and remember the container.
            use_valgrind = True
            if docker not in VALGRIND_LIST:
                VALGRIND_LIST.append(docker)
        elif tracked == VALGRIND_MAX and docker in VALGRIND_LIST:
            # List is full: only already-tracked containers use valgrind.
            use_valgrind = True

    if use_valgrind:
        _exec_system(valgrind_cmd)
    else:
        _exec_system(plain_cmd)

def node_check(docker):
    """Poll ``lich.node --stat`` until the node reports "running".

    The command is tried up to four times, sleeping one second after each of
    the first three failures.

    :param docker: name of the container passed to ``docker exec``.
    :returns: True as soon as the command exits 0 with output exactly
        "running"; False (after logging status and output) when the fourth
        attempt also fails.
    """
    failures = 0
    while True:
        cmd = "docker exec %s lich.node --stat" % (docker)
        status, output = commands.getstatusoutput(cmd)
        if status == 0 and output == "running":
            return True

        if failures >= 3:
            _derror("status: %s, output: %s" % (status, output))
            return False

        sleep(1)
        failures += 1

def fail_exit(msg):
    """Log ``msg`` as an error, then kill the test processes and this one.

    Processes whose ``ps -ef`` line matches ``test_list``, ``test.py`` or
    ``lich.inspect --recover`` are sent SIGKILL, followed by the current
    process itself.

    :param msg: error text handed to _derror().
    """
    _derror(msg)

    kill_cmds = (
        'for pid in `ps -ef | grep test_list | grep -v grep | cut -c 9-15`; do kill -9 $pid; done',
        'for pid in `ps -ef | grep test.py | grep -v grep | cut -c 9-15`; do kill -9 $pid; done',
        'for pid in `ps -ef | grep "lich.inspect --recover" | grep -v grep | cut -c 9-15`; do kill -9 $pid; done',
    )
    for shell_cmd in kill_cmds:
        # The second argument is passed as False for these three commands
        # only -- presumably it stops a non-zero exit from raising; verify
        # against utils._exec_system.
        _exec_system(shell_cmd, False)

    own_pid = os.getpid()
    _exec_system('kill -9 ' + str(own_pid))

def node_check_core(docker):
    """Abort the whole test run if the container has lich core dumps.

    Counts the entries of ``/tmp/core`` in the container whose listing
    contains "lich".

    :param docker: name of the container passed to ``docker exec``.
    :returns: True when the count is "0", or when the output carries the
        "starting container process caused" docker error (logged and
        tolerated).  Otherwise fail_exit() is called and, should it ever
        return, the result is None.
    """
    cmd = "docker exec %s ls -l /tmp/core 2> /dev/null| grep lich | wc -l" % (docker)
    status, output = commands.getstatusoutput(cmd)

    if status == 0 and output == "0":
        return True

    if "starting container process caused" in output:
        _derror("docker bug")
        return True

    fail_exit('fail sim error: %s coredump' % docker)

def node_check_valgrind(docker):
    """Abort the whole test run if the lich log shows valgrind errors.

    Greps ``/opt/fusionstack/log/lich.log`` in the container for
    VALGRIND_KEYWORD, discarding lines that say
    "definitely lost: 0 bytes".

    :param docker: name of the container passed to ``docker exec``.
    :returns: True when the pipeline exits non-zero (nothing matched), or
        when the output carries the "starting container process caused"
        docker error (logged and tolerated).  Otherwise fail_exit() is
        called with the matching lines and, should it ever return, the
        result is None.
    """
    cmd = "docker exec %s grep '%s' /opt/fusionstack/log/lich.log | grep -v 'definitely lost: 0 bytes'" % (docker, VALGRIND_KEYWORD)
    status, output = commands.getstatusoutput(cmd)

    if status != 0:
        return True

    if "starting container process caused" in output:
        _derror("docker bug")
        return True

    fail_exit('fail sim error: %s valgrind:%s' % (docker, output))

def get_rack_prefix(host_name):
    """Return the part of ``host_name`` that precedes its host component.

    The name is matched against ``(.*?).host*``: the shortest leading text
    that is followed by any one character and then "hos".  Note the dot is
    unescaped, so the separator may be any character.

    :param host_name: node name, e.g. "site1.rack1.host1".
    :returns: the captured prefix of the first match.
    :raises IndexError: if the pattern does not match at all.
    """
    prefixes = re.findall('(.*?).host*', host_name, re.S)
    return prefixes[0]

def get_site_prefix(host_name):
    """Return the part of ``host_name`` that precedes its rack component.

    The name is matched against ``(.*?).rack*``: the shortest leading text
    that is followed by any one character and then "rac".  Note the dot is
    unescaped, so the separator may be any character.

    :param host_name: node name, e.g. "site1.rack1.host1".
    :returns: the captured prefix of the first match.
    :raises IndexError: if the pattern does not match at all.
    """
    prefixes = re.findall('(.*?).rack*', host_name, re.S)
    return prefixes[0]

class Fail:
    """Failure-injection driver for a docker-based lich cluster.

    Two background threads are managed:

    * the *check* thread, which every few seconds inspects every container
      in ``dock_list`` for core dumps (and valgrind errors when enabled);
    * the *fail* thread, which repeatedly stops a random host, rack or site,
      runs recovery scans and starts the stopped nodes again.

    Both threads run for as long as ``self.running`` is true.
    """

    # lich.inspect exit codes after which the recovery scan is retried.
    _RETRY_CODES = (errno.EAGAIN, 126, 28, 1)
    # lich.inspect exit code after which the scan is silently abandoned.
    _GIVE_UP_CODE = 136

    def __init__(self, scan_interval, valgrind):
        """
        :param scan_interval: stored on the instance; not otherwise used by
            the current code.
        :param valgrind: when truthy, nodes are (re)started under valgrind
            and the check thread also greps the log for valgrind errors.
        """
        self.scan_interval = scan_interval
        self.running = False
        self.t = threading.Thread(target=self.__loop)
        self.c = threading.Thread(target=self.__check)
        self.valgrind = valgrind

    def check_start(self):
        """Start only the check thread."""
        self.c.start()

    def check_stop(self):
        """Wait for the check thread to finish."""
        self.c.join()

    def __check(self):
        """Check-thread body: scan all containers every 3 seconds."""
        while self.running:
            sleep(3)
            for node in dock_list:
                node_check_core(node)
                if self.valgrind:
                    node_check_valgrind(node)

        _dmsg("core check thread stoped %u" % self.running)

    def __recover(self, target):
        """Run ``lich.inspect --recover <target> --deep`` until it succeeds.

        The command is executed in the first container of ``dock_list``.
        Exit codes in _RETRY_CODES cause a retry after one second;
        _GIVE_UP_CODE ends the scan quietly; the loop also ends as soon as
        ``self.running`` becomes false.

        :param target: path argument handed to ``--recover``.
        :raises Exp: on any other non-zero exit code.
        """
        while self.running:
            cmd = "docker exec %s /opt/fusionstack/lich/libexec/lich.inspect --recover %s --deep" % (dock_list[0], target)
            ret = subprocess.call(cmd, shell=True)
            if not ret:
                break
            if ret in self._RETRY_CODES:
                time.sleep(1)
                continue
            if ret == self._GIVE_UP_CODE:
                return
            raise Exp(ret, "_scan fail: ret: %d" % ret)

    def __scan__(self, pool):
        """Recovery-scan a single pool (``/<pool>/``).

        :param pool: pool name.
        :raises Exp: see __recover().
        """
        self.__recover("/%s/" % pool)

    def __scan(self):
        """Recovery-scan the whole namespace (``/``).

        Per-pool scanning through __scan__() is currently not used.

        :raises Exp: see __recover().
        """
        self.__recover("/")

    def __fail(self, scan, seq):
        """Run one failure-injection round.

        Depending on ``site_count`` / ``rack_count`` a single random host,
        every host of a random rack, or every host of a random site is
        stopped.  The first container of ``dock_list`` is brought back
        immediately because the recovery scans are executed inside it.
        After two scans the remaining stopped nodes are started again.

        :param scan: unused (kept for interface compatibility).
        :param seq: round number, only used in log messages.
        :raises Exp: propagated from the recovery scans.
        """
        _dmsg("fail_simulate %u time %f begin ..." %(seq, time.time()))

        if site_count == 1 and rack_count == 1:
            #host level
            _dmsg('host level-->')
            victims = [random.choice(dock_list)]
        elif site_count == 1 and rack_count != 1:
            #rack level
            _dmsg('rack level-->')
            prefix = get_rack_prefix(random.choice(dock_list))
            # NOTE(review): substring match, so a prefix such as "rack1"
            # also selects "rack10" hosts -- confirm the naming scheme.
            victims = [host for host in dock_list if prefix in host]
        elif site_count != 1 and site_count != 0:
            #site level
            _dmsg('site level-->')
            prefix = get_site_prefix(random.choice(dock_list))
            victims = [host for host in dock_list if prefix in host]
        else:
            _derror('invalid dock_list')
            sys.exit(errno.EINVAL)

        for host in victims:
            node_stop(host, self.valgrind)
        stopped_list = victims

        primary = dock_list[0]
        if primary in stopped_list:
            _dwarn("force start %s" % primary)
            node_start(primary, self.valgrind)
            # Already running again: do not start it a second time below.
            stopped_list = [host for host in stopped_list if host != primary]

        sleep(10)
        self.__scan()
        self.__scan()

        for host in stopped_list:
            node_start(host, self.valgrind)

        _dmsg("fail_simulate %u time %f end ..." %(seq, time.time()))
        sleep(10)

    def __loop(self):
        """Fail-thread body: run injection rounds until stopped.

        A scan failure (Exp) is fatal: fail_exit() logs it and kills the
        test processes together with this process.
        """
        seq = 0
        while self.running:
            sleep(2)
            try:
                self.__fail(1, seq)
            except Exp as e:
                fail_exit('fail sim error: %s' % e.err)
            seq = seq + 1

        _dmsg(" fail sim thread stopped %u" % self.running)

    def start(self, nofail):
        """Start the check thread and, unless ``nofail``, the fail thread.

        :param nofail: pass False to enable failure injection.
        """
        self.running = True
        self.c.start()
        if nofail == False:
            _dmsg('fail sim start')
            self.t.start()
            _dmsg('fail sim started')

    def stop(self, nofail):
        """Signal both threads to stop and wait for them.

        :param nofail: must match the value given to start(); the fail
            thread is only joined when it is False.
        """
        self.running = False
        self.c.join()
        if nofail == False:
            _dmsg('fail sim stop')
            self.t.join()
            _dmsg('fail sim stoped')

def usage():
    """Print the command-line synopsis to standard output."""
    synopsis = sys.argv[0] + " --scan_interval [interval]"
    print("\n".join(["usage:", synopsis]))


def main():
    """Command-line entry point: run failure injection for a fixed time.

    Options:
      -h, --help              print usage and exit 0
      -v, --verbose           accepted, currently without effect
      --scan_interval <sec>   required; stored as the Fail scan interval
                              and also used as the run time in seconds

    Exits with errno.EINVAL on an unknown option, a non-integer interval
    or a missing ``--scan_interval``.
    """
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'hv', ['help', 'verbose', 'scan_interval=']
        )
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(errno.EINVAL)

    interval = None
    for o, a in opts:
        if o in ('-h', '--help'):
            usage()
            sys.exit(0)
        elif o in ('-v', '--verbose'):
            # Recognised for compatibility; nothing consumes it.
            pass
        elif o == '--scan_interval':
            try:
                interval = int(a)
            except ValueError:
                print("invalid scan_interval: %s" % a)
                usage()
                sys.exit(errno.EINVAL)

    if interval is None:
        # Without an interval there is nothing sensible to run.
        usage()
        sys.exit(errno.EINVAL)

    # Fail requires the valgrind flag; there is no command-line switch for
    # it, so valgrind checking stays off when run standalone.
    fail = Fail(interval, False)
    fail.start(False)

    sleep(interval)

    fail.stop(False)

    _dmsg('fail sim finished')

if __name__ == '__main__':
    # Any command-line argument hands control to main(); a bare invocation
    # only prints the usage text.
    if len(sys.argv) != 1:
        main()
    else:
        usage()
